Merck DID WDI example notebook

The Merck DID non-proprietary data is licensed CC0. This is an initial exploration of the data for uploading into Wikidata

Import all necessary modules, data files, and paths


In [1]:
from wikidataintegrator import wdi_core, wdi_login
from wikidataintegrator.ref_handlers import update_retrieved_if_new_multiple_refs
import pandas as pd
from pandas import read_csv
import requests
from tqdm.notebook import trange, tqdm
import ipywidgets 
import widgetsnbextension

datasrc = 'https://static-content.springer.com/esm/art%3A10.1186%2Fs13326-016-0110-0/MediaObjects/13326_2016_110_MOESM1_ESM.xlsx'

In [2]:
def check_wd(wd_property, searchlist):
    items_in_wd = []
    search_failures = []
    i=0
    for i in tqdm(range(len(search_list))):
        each_item = search_list[i]
        try:
            sparqlQuery = "SELECT * WHERE {?item wdt:"+wd_property+"\""+each_item+"\"}"
            result = wdi_core.WDItemEngine.execute_sparql_query(sparqlQuery)
            k=0
            while k < len(result["results"]["bindings"]): ## Take into account that there may be one to many mappings
                subject_qid = result["results"]["bindings"][k]["item"]["value"].replace("http://www.wikidata.org/entity/", "")
                tmpdict = {'item':each_item,'WDID':subject_qid}
                items_in_wd.append(tmpdict)
                k=k+1
        except:
            search_failures.append(each_item)
        i=i+1
    items_wd_df = pd.DataFrame(items_in_wd) 
    return(items_wd_df, search_failures)

#### Unit Test
search_list = ['C0016157','C0016004','C0021579','C0076425','C0025598','15307-86-5','1474034-05-3']
wd_property = 'P2892' ##WD Property for UMLS CUI
items_wd_df, search_failures = check_wd(wd_property, search_list)
if len(items_wd_df)>0:
    print('function successfully loaded and tested')


function successfully loaded and tested

In [3]:
did_raw = pd.read_excel(datasrc, header=[0, 1, 2], sheetname='DID')
did_raw.reset_index(inplace=True)
did_raw.columns = did_raw.columns.map(lambda h: ' '.join(h).replace(' ', '_'))

In [4]:
did_raw.rename(columns={'index__':'DID_ID','source_Unnamed:_0_level_1_source_name':'source_name',
                       'source_Unnamed:_1_level_1_source_record_ID_or_other_metadata':'source_record_id',
                       'drug_Unnamed:_2_level_1_raw_drug_name':'raw_drug_name',
                       'drug_CAS#_Preferred_CAS#':'drug_cas#', 'drug_CAS#_PT':'drug_cas_pt',
                       'drug_CAS#_source':'drug_cas_source', 'drug_CAS#_match_type':'drug_cas_match',
                       'drug_ChEBI_PT_("name")':'drug_ChEBI_PT_name', 'drug_ChEBI_ChEBI_ID#':'drug_ChEBI_ID', 
                       'drug_ChEBI_PT_match_type':'drug_ChEBI_PT_match', 
                       'drug_ChEBI_synonym_if_used_for_match':'drug_ChEBI_match_syn', 
                       'drug_ChEBI_synonym_match_type':'drug_ChEBI_synonym_match', 
                       'drug_ChEBI_match_aid_if_any':'drug_ChEBI_match_aid', 
                       'drug_CHEMID+_PT_(DisplayName[-]_>_DisplayName)':'drug_CHEMID_PT_DisplayName', 
                       'drug_CHEMID+_DisplayName_if_diff':'drug_CHEMID_DisplayName', 
                       'drug_CHEMID+_CAS#_or_ID':'drug_CHEMID_ID', 'drug_CHEMID+_PT_match_type':'drug_CHEMID_PT_match', 
                       'drug_CHEMID+_synonym_if_used_for_match':'drug_CHEMID_match_syn', 
                       'drug_CHEMID+_synonym_match_type':'drug_CHEMID_synonym_match', 
                       'drug_CHEMID+_match_aid_if_any':'drug_CHEMID_match_aid', 
                       'drug_CTD_PT_(ChemicalName)':'drug_CTD_PT_Name', 'drug_CTD_MESH_ID':'drug_CTD_MESH', 
                       'drug_CTD_CAS#':'drug_CTD_CAS', 'drug_CTD_PT_match_type':'drug_CTD_PT_match', 
                       'drug_CTD_synonym_if_used_for_match':'drug_CTD_match_syn', 
                       'drug_CTD_synonym_match_type':'drug_CTD_synonym_match', 
                       'drug_CTD_match_aid_if_any':'drug_CTD_match_aid', 
                       'drug_UMLS_PT_match_type':'drug_UMLS_PT_match', 
                       'drug_UMLS_synonym_if_used_for_match':'drug_UMLS_match_syn', 
                       'drug_UMLS_synonym_match_type':'drug_UMLS_synonym_match', 
                       'drug_UMLS_match_aid_if_any':'drug_UMLS_match_aid', 
                       'drug_UMLS_semantic_type_1':'drug_UMLS_sem_1', 'drug_UMLS_semantic_type_2':'drug_UMLS_sem_2', 
                       'drug_UMLS_semantic_type_3':'drug_UMLS_sem_3', 'drug_UMLS_semantic_type_4':'drug_UMLS_sem_4', 
                       'indication_subtype_(predicate)_Unnamed:_38_level_1_raw':'predicate_raw',
                       'indication_subtype_(predicate)_Unnamed:_39_level_1_aggregate_1':'predicate_aggregate', 
                       'indication_subtype_(predicate)_Unnamed:_40_level_1_string_search_[temp]':'predicate_string', 
                       'indication_raw_value_entire_value/string':'indication_raw_string',
                       'indication_raw_value_target/substring':'indication_target_substring', 
                       'indication_raw_value_target=entire_string?':'indication_entire_string?', 
                       'indication_UMLS_phenotype_entry_term_match_type':'umls_phen_term_match', 
                       'indication_UMLS_phenotype_entry_term':'umls_phen_term', 
                       'indication_UMLS_phenotype_PT':'umls_phen_PT', 
                       'indication_UMLS_phenotype_CUI':'umls_phen_cui', 
                       'indication_UMLS_phenotype_entry_term_type':'umls_phen_type', 
                       'indication_UMLS_phenotype_phenotype?':'umls_phen_phen', 
                       'indication_UMLS_phenotype_semantic_type_1':'umls_phen_sem_1', 
                       'indication_UMLS_phenotype_semantic_type_2':'umls_phen_sem_2', 
                       'indication_UMLS_phenotype_semantic_type_3':'umls_phen_sem_3', 
                       'indication_UMLS_phenotype_semantic_type_4':'umls_phen_sem_4', 
                       'indication_UMLS_initial,_if_different_entry_term_match_type':'umls_init_term_match', 
                       'indication_UMLS_initial,_if_different_entry_term':'umls_init_term', 
                       'indication_UMLS_initial,_if_different_PT':'umls_init_PT', 
                       'indication_UMLS_initial,_if_different_CUI':'umls_init_cui', 
                       'indication_UMLS_initial,_if_different_entry_term_type':'umls_init_type', 
                       'indication_UMLS_initial,_if_different_semantic_type_1':'umls_init_sem_1', 
                       'indication_UMLS_initial,_if_different_semantic_type_2':'umls_init_sem_2', 
                       'indication_UMLS_initial,_if_different_semantic_type_3':'umls_init_sem_3', 
                       'indication_UMLS_initial,_if_different_semantic_type_4':'umls_init_sem_4'}, inplace=True)

Clean up the results for items with greatest import potential

  1. Remove entries that don't have any predicates
  2. Keep only entries where the drug mapping was 'exact' or a synonym for the drug was considered 'exact'
  3. Keep only entries where the phenotype mapping is exact
  4. Remove entries where there is a predicate, but the predicate is 'marker or mechanism' (not available in WD)
  5. Subset entries by availability of more specific identifiers (CAS RNs have one to many mappings in Wikidata)

In [5]:
## Filter entries down
clean_complete_data = did_raw.loc[(did_raw['predicate_raw'].notnull()) & 
                                  ((did_raw['drug_UMLS_PT_match']=='exact')|(did_raw['drug_UMLS_synonym_match']=='exact')) &
                                  ((did_raw['umls_phen_term_match'].str.contains('exact'))|(did_raw['umls_init_term_match'].str.contains('exact')))]

clean_complete_less_markers = clean_complete_data[(clean_complete_data['predicate_raw']!='marker/mechanism') &
                                                  (clean_complete_data['umls_phen_phen']=='Y')]

#print(clean_complete_less_markers[['drug_UMLS_PT','drug_UMLS_CUI','predicate_raw','predicate_aggregate',
#                                   'umls_phen_PT','umls_phen_term_match','umls_phen_cui','umls_init_term_match']].head(n=2))

## Subset to entries with more specific identifiers
clean_complete_chebi = clean_complete_less_markers.loc[clean_complete_less_markers['drug_ChEBI_ID'].notnull()]
print(len(clean_complete_chebi))
print(len(clean_complete_less_markers))


40287
59038

Check for appropriate entities in Wikidata


In [12]:
## Check UMLS drug cui coverage in Wikidata
drug_umls_cuis_clean = clean_complete_less_markers['drug_UMLS_CUI'].unique().tolist()
search_list = drug_umls_cuis_clean
wd_property = 'P2892' ##WD Property for UMLS CUI
drugs_wd_df_cui, search_failures = check_wd(wd_property, search_list)
drugs_wd_df_cui.rename(columns={'item':'drug'},inplace=True)
#drugs_wd_df_cui.to_csv('results/drugs_by_cui_wd_df.tsv',sep='\t',header=True)




In [13]:
## Check CAS RN drug coverage in Wikidata
drug_cas_clean = clean_complete_less_markers['drug_cas#'].unique().tolist()
wd_property = 'P231' ##WD Property for CAS number
search_list = drug_cas_clean
drugs_wd_df_cas, cas_search_failures = check_wd(wd_property, search_list)
drugs_wd_df_cas.rename(columns={'item':'drug'},inplace=True)
#drugs_wd_df_cas.to_csv('results/results/drugs_by_cas_wd_df.tsv.tsv',sep='\t',header=True)




In [14]:
## Check for ChEBI drug coverage in Wikidata
chebi_list = clean_complete_chebi['drug_ChEBI_ID'].astype(int).astype(str).unique().tolist()
wd_property = 'P683' ##WD Property for CHEBI
search_list = chebi_list
drugs_wd_df_chebi, chebi_search_failures = check_wd(wd_property, search_list)
drugs_wd_df_chebi.rename(columns={'item':'drug'},inplace=True)
#drugs_wd_df_cui.to_csv('results/drugs_by_chebi_wd_df.tsv',sep='\t',header=True)




In [15]:
## Check for Phenotype CUI coverage in Wikidata
phen_umls_cuis_clean = clean_complete_less_markers['umls_phen_cui'].unique().tolist()
wd_property = 'P2892' ##WD Property for UMLS CUI
search_list = phen_umls_cuis_clean
phen_wd_df_cui, search_failures = check_wd(wd_property, search_list)
phen_wd_df_cui.rename(columns={'item':'phen'},inplace=True)
#phen_wd_df_cui.to_csv('results/phen_by_cui_wd_df.tsv',sep='\t',header=True)



Summary of initial data investigation


In [6]:
drugs_wd_df_cas = read_csv('results/drugs_by_cas_wd_df.tsv',delimiter='\t',header=0)
drugs_wd_df_chebi = read_csv('results/drugs_by_chebi_wd_df.tsv',delimiter='\t',header=0)
drugs_wd_df_cui = read_csv('results/drugs_by_cui_wd_df.tsv',delimiter='\t',header=0)
phen_wd_df_cui = read_csv('results/phen_by_cui_wd_df.tsv',delimiter='\t',header=0)

drugs_wd_df_cas.drop('Unnamed: 0',axis=1,inplace=True)
drugs_wd_df_chebi.drop('Unnamed: 0',axis=1,inplace=True)
drugs_wd_df_cui.drop('Unnamed: 0',axis=1,inplace=True)
phen_wd_df_cui.drop('Unnamed: 0',axis=1,inplace=True)

In [7]:
#### Summary
#print(did_raw.nunique()) ## The number of unique values for each column
print('number of DID_ID: ',len(did_raw['DID_ID'].unique()))
print('number of unique raw_drug_name: ',len(did_raw['raw_drug_name'].unique()))
print('number of unique drug umls_preferred_term:', len(did_raw['drug_UMLS_PT'].unique()))
print('number of raw predicates (not unique, not null):', len(did_raw.loc[did_raw['predicate_raw'].notnull()]))
print('number of unique umls preferred indication term:', len(did_raw['umls_phen_PT'].unique()))
print('number of DID entries with a predicate value:', len(clean_complete_data))
print('number of DID entries where the predicate is a "marker/mechanism":',len(clean_complete_data[clean_complete_data['predicate_raw']=='marker/mechanism']))
print("number of entries with predicate values that aren't 'marker/mechanism':",len(clean_complete_less_markers))
print('number of WD entities pulled by CAS number from DIDs with predicates:', len(drugs_wd_df_cas))
print('number of WD entities pulled by UMLS "drug" CUIS from DIDs with predicates:', len(drugs_wd_df_cui))
print('number of WD entities pulled by drug ChEBIs from DIDs with predicates:', len(drugs_wd_df_chebi))
print('number of WD entities pulled by UMLS "phenotype" CUIS from DIDs with predicates: ', len(phen_wd_df_cui))


number of DID_ID:  191111
number of unique raw_drug_name:  34137
number of unique drug umls_preferred_term: 21807
number of raw predicates (not unique, not null): 140181
number of unique umls preferred indication term: 6111
number of DID entries with a predicate value: 111618
number of DID entries where the predicate is a "marker/mechanism": 51282
number of entries with predicate values that aren't 'marker/mechanism': 59038
number of WD entities pulled by CAS number from DIDs with predicates: 9360
number of WD entities pulled by UMLS "drug" CUIS from DIDs with predicates: 2077
number of WD entities pulled by drug ChEBIs from DIDs with predicates: 4305
number of WD entities pulled by UMLS "phenotype" CUIS from DIDs with predicates:  1717

Check one-to-many and many-to-one mapping issues with for drugs


In [8]:
chk_wdid = drugs_wd_df_cas.groupby('WDID').size().reset_index(name='count')
chk_wdid_cas = drugs_wd_df_cas.groupby('drug').size().reset_index(name='count')

chk_cui_wdid = drugs_wd_df_cui.groupby('WDID').size().reset_index(name='count')
chk_cui_cui = drugs_wd_df_cui.groupby('drug').size().reset_index(name='count')

chk_chebi_wdid = drugs_wd_df_chebi.groupby('WDID').size().reset_index(name='count')
chk_chebi_chebi = drugs_wd_df_chebi.groupby('drug').size().reset_index(name='count')

chk_wdid_phen = phen_wd_df_cui.groupby('WDID').size().reset_index(name='count')
chk_wdid_phen_cui = phen_wd_df_cui.groupby('phen').size().reset_index(name='count')

print(len(chk_wdid.loc[chk_wdid['count']>1]),' drug WDIDs map to at least 2 CAS numbers.')
print(len(chk_wdid_cas.loc[chk_wdid_cas['count']>1]),' drug CAS numbers map to at least 2 WDID.')

print(len(chk_cui_wdid.loc[chk_cui_wdid['count']>1]),' drug WDIDs map to at least 2 CUIs.')
print(len(chk_cui_cui.loc[chk_cui_cui['count']>1]),' drug CUIs map to at least 2 WDID.')

print(len(chk_chebi_wdid.loc[chk_chebi_wdid['count']>1]),' drug WDIDs map to at least 2 CUIs.')
print(len(chk_chebi_chebi.loc[chk_chebi_chebi['count']>1]),' drug Chebis map to at least 2 WDIDs.')

print(len(chk_wdid_phen.loc[chk_wdid_phen['count']>1]),' phenotype WDIDs map to at least 2 UMLS CUIs.')
print(len(chk_wdid_phen_cui.loc[chk_wdid_phen_cui['count']>1]),' phenotype WDIDs map to at least 2 UMLS CUIs.')


1  drug WDIDs map to at least 2 CAS numbers.
70  drug CAS numbers map to at least 2 WDID.
0  drug WDIDs map to at least 2 CUIs.
3  drug CUIs map to at least 2 WDID.
4  drug WDIDs map to at least 2 CUIs.
4  drug Chebis map to at least 2 WDIDs.
42  phenotype WDIDs map to at least 2 UMLS CUIs.
119  phenotype WDIDs map to at least 2 UMLS CUIs.

Filter out entries with drugs/phenotypes that don't map to a Wikidata entry or have one-to-many mapping issues


In [9]:
multiple_mapping_issues_wdid = set(chk_wdid['WDID'].loc[chk_wdid['count']>1].unique().tolist()+
                                   chk_cui_wdid['WDID'].loc[chk_cui_wdid['count']>1].unique().tolist()+
                                   chk_wdid_phen['WDID'].loc[chk_wdid_phen['count']>1].unique().tolist()+
                                   chk_chebi_wdid['WDID'].loc[chk_chebi_wdid['count']>1].unique().tolist())

multiple_mapping_issues_cas = set(chk_wdid_cas['drug'].loc[chk_wdid_cas['count']>1].unique().tolist())
multiple_mapping_issues_chebi = set(chk_chebi_chebi['drug'].loc[chk_chebi_chebi['count']>1].unique().tolist())
multiple_mapping_issues_cui = set(chk_cui_cui['drug'].loc[chk_cui_cui['count']>1].unique().tolist()+
                                   chk_wdid_phen_cui['phen'].loc[chk_wdid_phen_cui['count']>1].unique().tolist())

cas_clean = drugs_wd_df_cas.loc[(~drugs_wd_df_cas['WDID'].isin(multiple_mapping_issues_wdid))&
                                (~drugs_wd_df_cas['drug'].isin(multiple_mapping_issues_cas))].copy()
chebi_clean = drugs_wd_df_chebi.loc[(~drugs_wd_df_chebi['WDID'].isin(multiple_mapping_issues_wdid))&
                                    (~drugs_wd_df_chebi['drug'].isin(multiple_mapping_issues_chebi))].copy()
cui_drug_clean = drugs_wd_df_cui.loc[(~drugs_wd_df_cui['WDID'].isin(multiple_mapping_issues_wdid))&
                                      (~drugs_wd_df_cui['drug'].isin(multiple_mapping_issues_cui))].copy()
cui_phen_clean = phen_wd_df_cui.loc[(~phen_wd_df_cui['WDID'].isin(multiple_mapping_issues_wdid))&
                                      (~phen_wd_df_cui['phen'].isin(multiple_mapping_issues_cui))].copy()

Subset the data to just items which have Wikidata entities and one-to-one mappings


In [10]:
tmp_slice = clean_complete_less_markers[['raw_drug_name','source_name','drug_UMLS_CUI','drug_ChEBI_ID','drug_cas#',
                                         'predicate_raw','predicate_aggregate','predicate_string',
                                         'indication_raw_string','umls_phen_cui','umls_phen_PT']]

cas_clean.rename(columns={'drug':'drug_cas#','WDID':'drug_cas_wdid'}, inplace=True)
chebi_clean.rename(columns={'drug':'drug_ChEBI_ID','WDID':'drug_chebi_wdid'}, inplace=True)
cui_drug_clean.rename(columns={'drug':'drug_UMLS_CUI','WDID':'drug_cui_wdid'}, inplace=True)
cui_phen_clean.rename(columns={'phen':'umls_phen_cui','WDID':'phen_cui_wdid'}, inplace=True)

cas_merged = tmp_slice.merge(cas_clean, on='drug_cas#', how='left')
chebi_merged = cas_merged.merge(chebi_clean, on='drug_ChEBI_ID', how='left')
drug_cui_merged = chebi_merged.merge(cui_drug_clean, on='drug_UMLS_CUI', how='left')
phen_merge = drug_cui_merged.merge(cui_phen_clean, on='umls_phen_cui', how='left')

potential_data_to_import = phen_merge.loc[(phen_merge['phen_cui_wdid'].notnull()) & 
                                         ((phen_merge['drug_cas_wdid'].notnull()) | 
                                          (phen_merge['drug_chebi_wdid'].notnull()) | 
                                          (phen_merge['drug_cui_wdid'].notnull()))]
print(len(potential_data_to_import))
print(potential_data_to_import.head(n=2))


24625
                      raw_drug_name source_name drug_UMLS_CUI drug_ChEBI_ID  \
7    thiazolidine-4-carboxylic acid     MeSH_PA      C0076425         64564   
10  1-hexylcarbamoyl-5-fluorouracil     MeSH_PA      C0044401           NaN   

     drug_cas# predicate_raw predicate_aggregate predicate_string  \
7     444-27-9          anti                 NaN              NaN   
10  61422-45-5          anti                 NaN              NaN   

    indication_raw_string umls_phen_cui         umls_phen_PT drug_cas_wdid  \
7   Antineoplastic Agents      C0006826  Malignant Neoplasms     Q23637400   
10  Antineoplastic Agents      C0006826  Malignant Neoplasms      Q5043732   

   drug_chebi_wdid drug_cui_wdid phen_cui_wdid  
7        Q23637400           NaN        Q12078  
10             NaN           NaN        Q12078  

Further subset the data to entries to ensure strict mapping to drug entities

  1. Determine number of entries where only a single identifier mapped
  2. Determine number of entries where multiple drug identifiers pulled different WDIDs for the same drug
  3. Subset the data for entries where a drug mapping was verified by mapping via at least 2 types of identifiers
  4. Subset the data for entries where a drug mapping was verified by mapping via at least 3 types of identifiers

In [11]:
## Single identifiers mapped
drug_wdid_single = potential_data_to_import.loc[(potential_data_to_import['drug_cas_wdid'].isnull() & 
                                                 potential_data_to_import['drug_chebi_wdid'].isnull()&
                                                 potential_data_to_import['drug_cas_wdid'].notnull())|
                                                (potential_data_to_import['drug_cas_wdid'].isnull() & 
                                                 potential_data_to_import['drug_chebi_wdid'].notnull()&
                                                 potential_data_to_import['drug_cas_wdid'].isnull())|
                                                (potential_data_to_import['drug_cas_wdid'].notnull() & 
                                                 potential_data_to_import['drug_chebi_wdid'].isnull()&
                                                 potential_data_to_import['drug_cas_wdid'].isnull())]

print('Potential DID entries for import with only a single drug WDID mapping:',len(drug_wdid_single))


## Identify data where the WDID mappings are contrary
drug_wdid_conflicting = potential_data_to_import.loc[((potential_data_to_import['drug_cas_wdid']!=potential_data_to_import['drug_chebi_wdid'])&
                                               (potential_data_to_import['drug_cas_wdid'].notnull())&(potential_data_to_import['drug_chebi_wdid'].notnull()))|
                                               ((potential_data_to_import['drug_cas_wdid']!=potential_data_to_import['drug_cui_wdid'])&
                                               (potential_data_to_import['drug_cas_wdid'].notnull())&(potential_data_to_import['drug_cui_wdid'].notnull()))|
                                               ((potential_data_to_import['drug_cui_wdid']!=potential_data_to_import['drug_chebi_wdid'])&
                                               (potential_data_to_import['drug_cui_wdid'].notnull())&(potential_data_to_import['drug_chebi_wdid'].notnull()))]
print('Potential DID entries for import with conflicting WDID mappings:',len(drug_wdid_conflicting))


## Further subset the data to entries where the drug is verified by mapping to the same WDID via two different identifiers
drug_wdid_strict = potential_data_to_import.loc[((potential_data_to_import['drug_cas_wdid']==potential_data_to_import['drug_chebi_wdid'])&
                                               potential_data_to_import['drug_cas_wdid'].notnull())|
                                               ((potential_data_to_import['drug_cas_wdid']==potential_data_to_import['drug_cui_wdid'])&
                                               potential_data_to_import['drug_cas_wdid'].notnull())|
                                               ((potential_data_to_import['drug_cui_wdid']==potential_data_to_import['drug_chebi_wdid'])&
                                               potential_data_to_import['drug_cui_wdid'].notnull())]
print('Potential DID entries for import with two or more WDID mappings:',len(drug_wdid_strict))

## Further subset the data to entries where the drug is verified by mapping to the same WDID via three different identifiers
drug_wdid_strictest = potential_data_to_import.loc[(potential_data_to_import['drug_cas_wdid']==potential_data_to_import['drug_chebi_wdid'])&
                                                 potential_data_to_import['drug_cas_wdid'].notnull()&
                                                (potential_data_to_import['drug_cas_wdid']==potential_data_to_import['drug_cui_wdid'])]

print('Potential DID entries for import with three or more WDID mappings:',len(drug_wdid_strictest))
#print(drug_wdid_strictest.head(n=2))


Potential DID entries for import with only a single drug WDID mapping: 1058
Potential DID entries for import with conflicting WDID mappings: 926
Potential DID entries for import with two or more WDID mappings: 17855
Potential DID entries for import with three or more WDID mappings: 11091

Investigate the predicates for mapping to Wikidata properties


In [12]:
base_dataset = drug_wdid_strictest

predicates = base_dataset.groupby(['source_name','predicate_raw']).size().reset_index(name='counts')
predicates.sort_values('counts',ascending=False,inplace=True)
#predicates.to_csv('results/raw_predicates_less_markers.tsv',sep='\t',header=True)
print(predicates.loc[predicates['counts']>9].head(n=20))


    source_name                    predicate_raw  counts
0           CTD                      therapeutic    6088
100       NDFRT                        may_treat    1574
1           CTD  therapeutic ;; marker/mechanism    1305
87      MeSH_PA                             anti     578
2         ChEBI                             anti     383
112     USAN_TC                             anti     174
127     WHO_ATC                             anti     116
103         PDR                             anti     108
99        NDFRT                may_prevent&treat      74
136      WHO_DD                             anti      66
98        NDFRT                      may_prevent      65
82     DrugBank                        treatment      51
122     USAN_TC                        treatment      34
3         ChEBI                           causes      30
88      MeSH_PA                           causes      20
145    evoc_ATC                             anti      19
91      MeSH_PA                             hypo      16
129     WHO_ATC                              for      16
130     WHO_ATC                        relaxants      12
126     WHO_ATC                          against      10

In [13]:
predicates_aggregate = base_dataset.groupby(['source_name','predicate_aggregate']).size().reset_index(name='counts')
predicates_aggregate.sort_values('counts',ascending=False,inplace=True)
#predicates_aggregate.to_csv('results/aggregate_predicates_less_markers.tsv',sep='\t',header=True)
print(predicates_aggregate)


  source_name predicate_aggregate  counts
4    DrugBank                adju      41
1    DailyMed                adju      18
6     MeSH_PA            inhibits       7
0       ChEBI            inhibits       5
8     USAN_TC            inhibits       5
2    DailyMed              concom       1
3    DailyMed             history       1
5    DrugBank              concom       1
7         PDR                adju       1
9     USAN_TC              treats       1

In [14]:
predicates_string = base_dataset.groupby(['source_name','predicate_string']).size().reset_index(name='counts')
predicates_string.sort_values('counts',ascending=False,inplace=True)
print(predicates_string.head(n=5))


   source_name predicate_string  counts
75    DrugBank             1057      23
32    DrugBank                9       8
76     MeSH_PA       Inhibitors       7
45    DrugBank               59       7
36    DrugBank               15       7

In [15]:
## Investigate difference between predicates_raw and predicates_aggregate
predicate_aggregate_sample = base_dataset[['raw_drug_name','predicate_aggregate','indication_raw_string','umls_phen_PT']].loc[base_dataset['predicate_aggregate']=='inhibits']
print(predicate_aggregate_sample.head(n=2))

predicate_raw_sample = base_dataset[['raw_drug_name','predicate_raw','predicate_aggregate','indication_raw_string','umls_phen_PT']].loc[base_dataset['predicate_raw']=='therapeutic']
print(predicate_raw_sample.head(n=5))


     raw_drug_name predicate_aggregate    indication_raw_string  \
6449    amprenavir            inhibits   HIV protease inhibitor   
6456    amprenavir            inhibits  HIV Protease Inhibitors   

        umls_phen_PT  
6449  Hiv Infections  
6456  Hiv Infections  
            raw_drug_name predicate_raw predicate_aggregate  \
30023  3-keto-desogestrel   therapeutic                 NaN   
30117     4-Aminopyridine   therapeutic                 NaN   
30120     4-Aminopyridine   therapeutic                 NaN   
30121     4-Aminopyridine   therapeutic                 NaN   
30538            abacavir   therapeutic                 NaN   

      indication_raw_string        umls_phen_PT  
30023          Dysmenorrhea        Dysmenorrhea  
30117  Arrhythmias, Cardiac  Cardiac Arrhythmia  
30120           Hypotension         Hypotension  
30121    Multiple Sclerosis  Multiple Sclerosis  
30538           Hepatitis B         Hepatitis B  

Import results to Wikidata based on predicate type

  1. Set aside DID entries derived from CTD as the mapping of therapeutics is not clear
  2. Map remaining predicates to Wikidata properties in preparation for conversion and import

In [20]:
predicates_no_ctd = base_dataset.loc[base_dataset['source_name']!='CTD']
pred_freq_no_ctd = predicates_no_ctd.groupby('predicate_raw').size().reset_index(name='counts')
pred_freq_no_ctd.sort_values('counts',ascending=False,inplace=True)
print(pred_freq_no_ctd.head(n=15))


        predicate_raw  counts
42          may_treat    1574
19               anti    1446
82          treatment      99
41  may_prevent&treat      74
21             causes      67
40        may_prevent      65
30               hypo      26
29                for      24
63          relaxants      21
80              tonic      20
33              lytic      19
38         management      18
16            against      17
32          inhibitor      17
24               cide      15

In [16]:
## Spot check to see coverage of information 'may treats' data loaded to Wikidata
may_treats = base_dataset.loc[(base_dataset['predicate_raw']=='may_treat')|(base_dataset['predicate_raw']=='may_prevent&treat')|(base_dataset['predicate_raw']=='treatment')]
may_prevent = base_dataset.loc[(base_dataset['predicate_raw']=='may_prevent')|(base_dataset['predicate_raw']=='may_prevent&treat')]
causes = base_dataset.loc[base_dataset['predicate_raw']=='causes']
may_treats.to_csv('results/may_treat.tsv',sep='\t',header=True)
may_prevent.to_csv('results/may_prevent.tsv',sep='\t',header=True)
causes.to_csv('results/cause.tsv',sep='\t',header=True)
print(may_treats.head(n=10))

## Aluminum Hydroxied == no may_treats property, but has property 'has role' antacid
## Sodium bicarbonate == has 'medical treatments property' for GRD, cardiac arrest and dyspepsia, not for hyperkalemia, or drug overdose
## calcium acetate == has 'medical treatments property' for ckd, osteoporosis, hyperphosphatemia but not for hypocalcemia

In [ ]:
## Note that many 'subject has role' statements have already been imported into Wikidata,
## However, there doesn't appear to be any links from the antiagent to it's actual disease effect
## this can be imported from DID

#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='hypo'].head(n=26))
## All hypo predicates refer to hypoglycemic (anti-diabetic agents, treat like anti)

#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='for'].head(n=24))
## for predicates use with therapeutic area?

#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='relaxants'].head(n=21))
## subject has role central muscle relaxant (note that these don't include nmj blockers)
## use with therapeutic area

#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='lytic'].head(n=20))
## subject has role keratolytic
## use with therapeutic area

#print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='management'].head(n=20))
## use with therapeutic area

print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='treatment'].head(n=20))

In [197]:
### Best way to Model anti's? The antis are more generic in terms of therapeutic areas
### Subject has role(P2686) in Antibiotic | Antineoplastic
### Antineoplastic therapeutic area P4044 Malignant Neoplasms

#antitypes = predicates_no_ctd['indication_raw_string'].loc[predicates_no_ctd['predicate_raw']=='anti'].unique().tolist()
#print(len(antitypes))
print(predicates_no_ctd.loc[predicates_no_ctd['predicate_raw']=='anti'].head(n=2))
print(antitypes)


            raw_drug_name source_name drug_UMLS_CUI drug_ChEBI_ID  \
1338    indole-3-carbinol     MeSH_PA      C0063491         24814   
1525  abiraterone acetate       ChEBI      C2607886         68639   

        drug_cas# predicate_raw predicate_aggregate predicate_string  \
1338     700-06-1          anti                 NaN              NaN   
1525  154229-18-2          anti                 NaN              NaN   

      indication_raw_string umls_phen_cui         umls_phen_PT drug_cas_wdid  \
1338  Antineoplastic Agents      C0006826  Malignant Neoplasms      Q1770257   
1525   antineoplastic agent      C0006826  Malignant Neoplasms     Q27888393   

     drug_chebi_wdid drug_cui_wdid phen_cui_wdid  
1338        Q1770257      Q1770257        Q12078  
1525       Q27888393     Q27888393        Q12078  
['Antineoplastic Agents', 'antineoplastic agent', 'antihypertensive agent', 'antineoplastic', 'Anti-Inflammatory Agents', 'SKIN & MUCUOUS MEMBRANE AGENTS; ANTIPRURITICS', 'non-steroidal anti-inflammatory drug', 'ANTIPARKINSONIAN AGENTS; DOPAMINERGIC AGENTS', 'anti-anginal', 'antiglaucoma drug', 'antidepressant', 'anti-asthmatic drug', 'GASTROINTESTINAL AGENTS; ANTI-INFLAMMATORY AGENTS', 'ANTIPARKINSONIAN AGENTS; ANTICHOLINERGIC AGENTS', 'antiparkinsonian', 'antiparkinson drug', 'antihypertensive (beta-blocker)', 'Anti-Arrhythmia Agents', 'Antihypertensive Agents', 'antifungal agent', 'antifungal', 'PSYCHOTHERAPEUTIC AGENTS; ANTIANXIETY AGENTS; BENZODIAZEPINES & COMBINATIONS', 'PSYCHOTHERAPEUTIC AGENTS; ANTIDEPRESSANTS; SELECTIVE SEROTONIN REUPTAKE INHIBITORS (SSRI)', 'antidepressant (selective serotonin reuptake inhibitor)', 'antipruritic drug', 'anti-inflammatory', 'Anti-Asthmatic Agents', 'Indications and Usage Dexamethasone Sodium Phosphate Injection is indicated as a rapid adrenal glucocorticoid and/or anti-inflammatory agent in horses,', 'ANALGESICS; NONSTEROIDAL ANTI-INFLAMMATORY DRUGS (NSAIDS)', 'OPHTHALMIC PREPARATIONS; ANTI-INFLAMMATORY AGENTS; NONSTEROIDAL ANTI-INFLAMMATORY DRUGS (NSAIDS)', 'antispasmodic drug', 'antihypertensive', 'ANTI-INFECTIVE AGENTS, SYSTEMIC; ANTIPROTOZOAL AGENTS', 'ANTIARRHYTHMICS', 'antipyretic', 'Antipsychotic Agents', 'RESPIRATORY AGENTS; ANTI-INFLAMMATORY AGENTS; STEROIDAL ANTI-INFLAMMATORY AGENTS & COMBINATIONS', 'ANTINEOPLASTICS ; HORMONAL AGONISTS/ANTAGONISTS; GONADOTROPIN RELEASING HORMONE (GNRH) ANALOGUES', 'RESPIRATORY AGENTS; ANTITUSSIVES; NARCOTIC ANTITUSSIVES & COMBINATIONS', 'antitussive', 'ANTINEOPLASTICS ; MISCELLANEOUS ANTINEOPLASTICS', 'Anti-Inflammatory Agents, Non-Steroidal', 'Antidepressive Agents', 'OPHTHALMIC PREPARATIONS; ANTI-INFLAMMATORY AGENTS; STEROIDAL ANTI-INFLAMMATORY AGENTS & COMBINATIONS', 'ANTINEOPLASTICS ; ALKYLATING AGENTS; NITROGEN MUSTARDS', 'Antineoplastic Agents, Hormonal', 'ANTINEOPLASTICS ; STEROIDS & COMBINATIONS', 'antitrichomonal drug', 'Antineoplastic Agents, Alkylating', 'URINARY TRACT AGENTS; ANTISPASMODICS', 'antiprotozoal drug', 'PSYCHOTHERAPEUTIC AGENTS; ANTIDEPRESSANTS; TRICYCLIC ANTIDEPRESSANTS & COMBINATIONS', 'PSYCHOTHERAPEUTIC AGENTS; ANTIPSYCHOTIC AGENTS; MISCELLANEOUS ANTIPSYCHOTIC AGENTS', 'ANTIDIABETIC AGENTS; THIAZOLIDINEDIONES & COMBINATIONS', 'antidiabetic', 'Antifungal Agents', 'ANTIPARKINSONIAN AGENTS; MONOAMINE OXIDASE INHIBITORS (MAOI)', 'PSYCHOTHERAPEUTIC AGENTS; ANTIANXIETY AGENTS; MISCELLANEOUS ANTIANXIETY AGENTS', 'NASAL PREPARATIONS; ANTI-INFLAMMATORY AGENTS; STEROIDAL ANTI-INFLAMMATORY AGENTS', 'PSYCHOTHERAPEUTIC AGENTS; ANTIDEPRESSANTS; MISCELLANEOUS ANTIDEPRESSANTS', 'Anti-HIV Agents', 'Antipyretics', 'Anthelmintics', 'Antiplatyhelmintic Agents', 'Antiprotozoal Agents', 'ANTI-INFECTIVE AGENTS, SYSTEMIC; ANTHELMINTICS', 'anthelmintic', "antineoplastic used in the treatment of AIDS-related Kaposi's sarcoma and in the treatment of acute promyelocytic leukemia", 'Anti-Anxiety Agents', 'Anti-Dyskinesia Agents', 'Antiparkinson Agents', 'Antidepressive Agents, Tricyclic', 'antimalarial', 'Antimalarials', 'Antidepressive Agents, Second-Generation', 'ANTINEOPLASTICS ; HORMONAL AGONISTS/ANTAGONISTS; ANTIESTROGENS', 'antifungal used in the treatment of infections caused by Candida (all species), Aspergillus, and Pneumocystis', 'antipsoriatic', 'ANTINEOPLASTICS ; ADJUNCT ANTINEOPLASTIC THERAPY', 'Anticholesteremic Agents', 'antineoplastic, inhibitor of VEGF/PDGF tyrosine kinases', 'Antimetabolites, Antineoplastic', 'antipsychotic agent', 'anti-asthmatic agent', 'antineoplastic (Src kinase inhibitor)', 'ANTINEOPLASTICS ; ALKYLATING AGENTS; MISCELLANEOUS ALKYLATING AGENTS', 'Antitussive Agents', 'ANTINEOPLASTICS ; ANTIMETABOLITES', 'Antipruritics', 'Anti-Infective Agents, Urinary', 'ANTINEOPLASTICS ; ANTIBIOTICS', 'ANTIPARKINSONIAN AGENTS; CATECHOL-O-METHYLTRANSFERASE INHIBITORS', 'Antineoplastic Agents, Phytogenic', 'SKIN & MUCUOUS MEMBRANE AGENTS; ANTINEOPLASTICS', 'ANTINEOPLASTICS ; HORMONAL AGONISTS/ANTAGONISTS; ANTIANDROGENS', 'ANTI-INFECTIVE AGENTS, SYSTEMIC; ANTIFUNGALS', 'Antisickling Agents', 'antihypertensive (angiotensin II receptor antagonist)', 'antineoplastic (blocks PDGF receptor function, inhibiting the growth and survival of human tumor cells when administered intravenously)', 'antithyroid drug', 'Antithyroid Agents', 'pharmaceutic aid (antifungal agent)', 'antihypertensive (beta-blocker, ophthalmic)', 'anti-HIV agent', 'ANTIDIABETIC AGENTS; MEGLITINIDES', 'antidiabetic used in the treatment of type II diabetes mellitus', 'anti-inflammatory agent', 'Anti-Obesity Agents', 'ANTI-INFECTIVE AGENTS, SYSTEMIC; ANTIMALARIAL AGENTS', 'antidiabetic (oral hypoglycemic agent)', 'PSYCHOTHERAPEUTIC AGENTS; ANTIPSYCHOTIC AGENTS; PHENOTHIAZINES & COMBINATIONS', 'Antitrichomonal Agents', 'antiprotozoal', 'Uses Antibacterial hand soap', 'antineoplastic, histone deacetylase inhibitor', 'ANTHELMINTICS', 'ANTIDEPRESSANTS', 'Antidotes', 'ANTIHYPERTENSIVES', 'ANTIMALARIALS', 'ANTIMYCOBACTERIALS', 'ANTIPROTOZOALS', 'ANTIPSORIATICS', 'ANTIPSYCHOTICS', 'OTHER ANALGESICS AND ANTIPYRETICS', 'ANTITHROMBOTIC AGENTS']

In [ ]: